Use the Quality
of Life data (Case06_QoL_Symptom_ChronicIllness) to fit several
different Multiple Linear Regression models predicting clinically
relevant outcomes, e.g., Chronic Disease Score.
Summarize and visualize the data using summary,
str, pairs.panels, ggplot.
ql<-read.csv("I:/UBD PB/ZH5102 ML/ass7/Case06_QoL_Symptom_ChronicIllness.csv")
summary(ql)
## ID INTERVIEWDATE LANGUAGE AGE
## Min. : 171.0 Min. : 0.00 Min. :1.000 Min. :19.00
## 1st Qu.: 819.8 1st Qu.: 0.00 1st Qu.:1.000 1st Qu.:51.00
## Median :1427.5 Median : 0.00 Median :1.000 Median :59.00
## Mean :1472.0 Mean : 19.91 Mean :1.205 Mean :58.56
## 3rd Qu.:2143.2 3rd Qu.: 0.00 3rd Qu.:1.000 3rd Qu.:66.00
## Max. :2838.0 Max. :440.00 Max. :2.000 Max. :90.00
## RACE_ETHNICITY SEX QOL_Q_01 QOL_Q_02
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :1.000 Median :4.000 Median :3.000
## Mean :3.615 Mean :1.417 Mean :3.649 Mean :3.407
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :7.000 Max. :2.000 Max. :6.000 Max. :6.000
## QOL_Q_03 QOL_Q_04 QOL_Q_05 QOL_Q_06
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :4.000 Median :3.000 Median :3.000 Median :3.000
## Mean :3.701 Mean :3.022 Mean :3.117 Mean :2.937
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## QOL_Q_07 QOL_Q_08 QOL_Q_09 QOL_Q_10
## Min. :-1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.: 1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00
## Median : 4.000 Median :3.000 Median :3.000 Median :3.00
## Mean : 4.108 Mean :2.876 Mean :3.144 Mean :2.92
## 3rd Qu.: 7.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.00
## Max. :10.000 Max. :6.000 Max. :6.000 Max. :6.00
## MSA_Q_01 MSA_Q_02 MSA_Q_03 MSA_Q_04
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :2.000 Median :2.000
## Mean :2.941 Mean :3.035 Mean :2.267 Mean :2.369
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## MSA_Q_05 MSA_Q_06 MSA_Q_07 MSA_Q_08
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :2.000 Median :2.000 Median :1.000
## Mean :1.862 Mean :2.393 Mean :2.165 Mean :1.453
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## MSA_Q_09 MSA_Q_10 MSA_Q_11 MSA_Q_12
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :1.000 Median :1.000
## Mean :2.144 Mean :1.673 Mean :2.216 Mean :2.009
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## MSA_Q_13 MSA_Q_14 MSA_Q_15 MSA_Q_16 MSA_Q_17
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :1.000 Median :1.000 Median :1.000 Median :1.00
## Mean :1.874 Mean :1.968 Mean :1.744 Mean :1.807 Mean :2.06
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.00
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.00
## PH2_Q_01 PH2_Q_02 TOS_Q_01 TOS_Q_02
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :4.000
## Mean :2.452 Mean :2.343 Mean :2.219 Mean :3.287
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :6.000 Max. :6.000 Max. :5.000 Max. :5.000
## TOS_Q_03 TOS_Q_04 CHARLSONSCORE CHRONICDISEASESCORE
## Min. :1.000 Min. :1.000 Min. :-9.0000 Min. :-9.0000
## 1st Qu.:4.000 1st Qu.:5.000 1st Qu.: 0.0000 1st Qu.: 0.7400
## Median :4.000 Median :5.000 Median : 1.0000 Median : 1.3100
## Mean :3.768 Mean :4.657 Mean : 0.8718 Mean : 0.8648
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.: 1.0000 3rd Qu.: 1.9700
## Max. :5.000 Max. :6.000 Max. :10.0000 Max. : 4.7600
str(ql)
## 'data.frame': 2356 obs. of 41 variables:
## $ ID : int 171 171 172 179 180 180 181 182 183 186 ...
## $ INTERVIEWDATE : int 0 427 0 0 0 42 0 0 0 0 ...
## $ LANGUAGE : int 1 1 1 1 1 1 1 1 1 2 ...
## $ AGE : int 49 49 62 44 64 64 52 48 49 78 ...
## $ RACE_ETHNICITY : int 3 3 3 7 3 3 3 3 3 4 ...
## $ SEX : int 2 2 2 2 1 1 2 1 1 1 ...
## $ QOL_Q_01 : int 4 4 3 6 3 3 4 2 3 5 ...
## $ QOL_Q_02 : int 4 3 3 6 2 5 4 1 4 6 ...
## $ QOL_Q_03 : int 4 4 4 6 3 6 4 3 4 4 ...
## $ QOL_Q_04 : int 4 4 2 6 3 6 2 2 5 2 ...
## $ QOL_Q_05 : int 1 5 4 6 2 6 4 3 4 3 ...
## $ QOL_Q_06 : int 4 4 2 6 1 2 4 1 2 4 ...
## $ QOL_Q_07 : int 1 2 5 -1 0 5 8 4 3 7 ...
## $ QOL_Q_08 : int 6 1 3 6 6 6 3 1 2 4 ...
## $ QOL_Q_09 : int 3 4 3 6 2 2 4 2 2 4 ...
## $ QOL_Q_10 : int 3 1 3 6 3 6 3 2 4 3 ...
## $ MSA_Q_01 : int 1 3 2 6 2 3 4 1 1 2 ...
## $ MSA_Q_02 : int 1 1 2 6 1 6 4 3 2 4 ...
## $ MSA_Q_03 : int 2 1 2 6 1 2 3 3 1 2 ...
## $ MSA_Q_04 : int 1 3 2 6 1 2 1 4 1 5 ...
## $ MSA_Q_05 : int 1 1 1 6 1 2 1 6 3 2 ...
## $ MSA_Q_06 : int 1 2 2 6 1 2 1 1 2 2 ...
## $ MSA_Q_07 : int 2 1 3 6 1 1 1 1 1 5 ...
## $ MSA_Q_08 : int 1 1 1 6 1 1 1 1 2 1 ...
## $ MSA_Q_09 : int 1 1 1 6 2 2 4 6 2 1 ...
## $ MSA_Q_10 : int 1 1 1 6 1 1 1 1 1 3 ...
## $ MSA_Q_11 : int 2 3 2 6 1 1 2 1 1 5 ...
## $ MSA_Q_12 : int 1 1 2 6 1 1 2 6 1 3 ...
## $ MSA_Q_13 : int 1 1 1 6 1 6 2 1 4 2 ...
## $ MSA_Q_14 : int 1 1 1 6 1 2 1 1 3 1 ...
## $ MSA_Q_15 : int 2 1 1 6 1 1 3 2 1 3 ...
## $ MSA_Q_16 : int 2 3 5 6 1 2 1 2 1 2 ...
## $ MSA_Q_17 : int 2 1 1 6 1 1 1 1 1 3 ...
## $ PH2_Q_01 : int 3 2 1 5 1 1 3 1 2 3 ...
## $ PH2_Q_02 : int 4 4 1 5 1 2 1 1 4 2 ...
## $ TOS_Q_01 : int 2 2 2 4 1 1 2 2 1 1 ...
## $ TOS_Q_02 : int 1 1 1 4 4 4 1 2 4 4 ...
## $ TOS_Q_03 : int 4 4 4 4 4 4 4 4 4 4 ...
## $ TOS_Q_04 : int 5 5 5 5 5 5 5 5 5 5 ...
## $ CHARLSONSCORE : int 2 2 3 1 0 0 2 8 0 1 ...
## $ CHRONICDISEASESCORE: num 1.6 1.6 1.54 2.97 1.28 1.28 1.31 1.67 2.21 2.51 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(dplyr)
ql$LANGUAGE<-as.factor(recode(ql$LANGUAGE,"1" = "ENGLISH","2" = "SPANISH"))
ql$SEX<-as.factor(recode(ql$SEX,"1" = "FEMALE","2" = "MALE"))
ql$RACE_ETHNICITY<-as.factor(recode(ql$RACE_ETHNICITY,"1" = "AI-ALN","2" = "ASIAN","3" = "BLACK","4" = "HISPANIC","5"= "NHPIS","6" = "WHITE","7" = "UNKNOWN"))
ql$QOL_Q_07<-as.integer(ifelse(ql$QOL_Q_07<0, "NA", ql$QOL_Q_07))
## Warning: NAs introduced by coercion
ql$CHARLSONSCORE<-as.integer(ifelse(ql$CHARLSONSCORE<0, "NA", ql$CHARLSONSCORE))
## Warning: NAs introduced by coercion
ql$CHRONICDISEASESCORE<-as.integer(ifelse(ql$CHRONICDISEASESCORE<0, "NA", ql$CHRONICDISEASESCORE))
## Warning: NAs introduced by coercion
ql <- na.omit(ql[,c(-1,-2)])
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(ql[,c(2,38:39)])
library(ggplot2)
agep<-ggplot(data=ql, aes(x=AGE)) +
geom_bar(stat="bin", width=0.7, fill="steelblue")
agep
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
csp<-ggplot(data=ql, aes(x=CHARLSONSCORE)) +
geom_bar(stat="bin", width=0.7, fill="steelblue")
csp
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cdsp<-ggplot(data=ql, aes(x=CHRONICDISEASESCORE)) +
geom_bar(stat="bin", width=0.7, fill="steelblue")
cdsp
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
g_position <- ggpairs(data=ql[,c(2,4,38:39)], title="Quality of Life by Gender",
mapping=ggplot2::aes(colour = SEX),
lower=list(combo=wrap("facethist",binwidth=1)))
g_position
```
Report paired correlations for numeric data and try to visualize these (e.g., heatmap, pairs plot, etc.)
cor(ql[,c(2,38:39)])
## AGE CHARLSONSCORE CHRONICDISEASESCORE
## AGE 1.00000000 0.08466444 0.09067902
## CHARLSONSCORE 0.08466444 1.00000000 0.19488969
## CHRONICDISEASESCORE 0.09067902 0.19488969 1.00000000
my_colors <- colorRampPalette(c("cyan", "blue","navyblue", "deeppink3"))
heatmap(cor(ql[,c(2,38:39)]),Rowv = NA, Colv = NA, col=my_colors(256))
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(ql) %>%
add_trace(type = 'splom', dimensions = list( list(label='Age', values=~AGE),
list(label='Charlson Score', values=~CHARLSONSCORE), list(label='Chronic Disease Score', values=~CHRONICDISEASESCORE)),
marker = list(color = as.integer(ql$SEX),
size = 7, line = list(width = 1, color = 'rgb(230,230,230)')
)
) %>%
layout(title= 'QL Pairs Plot', hovermode='closest', dragmode= 'select',
plot_bgcolor='rgba(240,240,240, 0.95)')
Examine potential dependencies of the predictors and the dependent response variables
# Low correlation coefficient indicate independencies or no multicollinearity among the numerical variable.
# Response variables are identified to be Chronic Disease Score or Charlson Score as they are the outcome/ output influenced by demographic predictors(Age, Gender, etc).Fit a Multiple Linear Regression model, report the results, and explain the summary, residuals, effect-size coefficients, and the coefficient of determination, \(R^2\).
fit <- lm(CHRONICDISEASESCORE ~ ., data=ql)
summary(fit)
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ ., data = ql)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07812 -0.66297 -0.00553 0.41054 2.88359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.815e-01 2.218e-01 0.818 0.413246
## LANGUAGESPANISH -9.785e-02 6.870e-02 -1.424 0.154506
## AGE 5.211e-03 1.583e-03 3.293 0.001009 **
## RACE_ETHNICITYASIAN -1.012e-01 1.694e-01 -0.598 0.550183
## RACE_ETHNICITYBLACK -1.451e-01 1.561e-01 -0.930 0.352638
## RACE_ETHNICITYHISPANIC 1.233e-01 1.534e-01 0.804 0.421418
## RACE_ETHNICITYNHPIS -1.365e-02 3.592e-01 -0.038 0.969692
## RACE_ETHNICITYUNKNOWN 8.122e-02 1.709e-01 0.475 0.634635
## RACE_ETHNICITYWHITE -7.069e-02 1.642e-01 -0.431 0.666853
## SEXMALE -2.463e-02 3.905e-02 -0.631 0.528230
## QOL_Q_01 -1.204e-02 2.360e-02 -0.510 0.610119
## QOL_Q_02 7.079e-02 2.120e-02 3.340 0.000855 ***
## QOL_Q_03 1.674e-02 2.484e-02 0.674 0.500336
## QOL_Q_04 -1.785e-02 1.902e-02 -0.938 0.348123
## QOL_Q_05 1.507e-02 1.921e-02 0.784 0.432859
## QOL_Q_06 3.537e-02 1.695e-02 2.086 0.037091 *
## QOL_Q_07 3.027e-02 7.965e-03 3.801 0.000149 ***
## QOL_Q_08 -1.084e-02 1.950e-02 -0.556 0.578522
## QOL_Q_09 -3.519e-02 2.067e-02 -1.703 0.088799 .
## QOL_Q_10 1.267e-03 1.734e-02 0.073 0.941761
## MSA_Q_01 1.149e-02 1.728e-02 0.665 0.506358
## MSA_Q_02 1.576e-03 1.694e-02 0.093 0.925879
## MSA_Q_03 -1.145e-02 1.531e-02 -0.748 0.454601
## MSA_Q_04 3.190e-03 1.469e-02 0.217 0.828171
## MSA_Q_05 -7.064e-03 1.590e-02 -0.444 0.656881
## MSA_Q_06 3.581e-02 1.562e-02 2.293 0.021937 *
## MSA_Q_07 -3.610e-03 1.451e-02 -0.249 0.803541
## MSA_Q_08 -2.177e-02 1.847e-02 -1.179 0.238545
## MSA_Q_09 4.984e-04 1.522e-02 0.033 0.973878
## MSA_Q_10 9.468e-03 1.555e-02 0.609 0.542755
## MSA_Q_11 -9.008e-03 1.165e-02 -0.773 0.439569
## MSA_Q_12 1.211e-02 1.396e-02 0.867 0.386050
## MSA_Q_13 2.348e-02 1.493e-02 1.573 0.115810
## MSA_Q_14 -3.481e-05 1.576e-02 -0.002 0.998237
## MSA_Q_15 -2.012e-02 1.525e-02 -1.319 0.187279
## MSA_Q_16 -3.469e-03 1.447e-02 -0.240 0.810561
## MSA_Q_17 1.160e-02 1.394e-02 0.832 0.405433
## PH2_Q_01 6.986e-03 1.721e-02 0.406 0.684843
## PH2_Q_02 3.210e-02 1.889e-02 1.699 0.089451 .
## TOS_Q_01 -2.149e-02 1.967e-02 -1.093 0.274600
## TOS_Q_02 -8.441e-03 1.465e-02 -0.576 0.564599
## TOS_Q_03 1.124e-02 5.983e-02 0.188 0.850949
## TOS_Q_04 -1.237e-02 4.699e-02 -0.263 0.792354
## CHARLSONSCORE 1.163e-01 1.440e-02 8.079 1.14e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7918 on 1944 degrees of freedom
## Multiple R-squared: 0.1077, Adjusted R-squared: 0.08798
## F-statistic: 5.458 on 43 and 1944 DF, p-value: < 2.2e-16
#F Statistics is significant, at least one predictor is related to response variable.
# The Significant variables are AGE, QOL_Q_02, QOL_Q_06 ,QOL_Q_07, MSA_Q_06 & CHARLSONSCORE.
# Every per unit increase by AGE resulted in 0.0052 increase in CHRONICDISEASESCORE.
# Every per unit increase by QOL_Q_02, QOL_Q_06 ,QOL_Q_07 and MSA_Q_06 resulted in 0.071,0.035, 0.030 and 0.036 increase in CHRONICDISEASESCORE respectively.
# Every per unit increase by CHARLSONSCORE resulted in 0.12 increase in CHRONICDISEASESCORE.
# The range of actual and predicted value have a median of -0.006 & range of 5.03451 (-2.08 to 2.88)
# Rsq of 0.09 indicates a weak relationship between the independent and dependent variables.Draw model diagnostic plots, at least QQ plot, residuals plot and leverage plot (half norm plot)
# Residual Plot
plot_ly(x=fit$fitted.values, y=fit$residuals, type="scatter", mode="markers") %>%
layout(title="LM: Fitted-values vs. Model-Residuals",
xaxis=list(title="Fitted"),
yaxis = list(title="Residuals"))
QQ <- qqplot(fit$fitted.values, fit$residuals, plot.it=FALSE)
plot_ly() %>%
add_markers(x=~QQ$x, y=~QQ$y, name="Quantiles Scatter", type="scatter", mode="markers") %>%
add_trace(x = ~c(0.2,2.4), y = ~c(-2,3), type="scatter", mode="lines",
line = list(color = "red", width = 4), name="Line", showlegend=F) %>%
layout(title='Quantile plot',
xaxis = list(title="Fitted"),
yaxis = list(title="Residuals"),
legend = list(orientation = 'h'))
library(faraway)
## Warning: package 'faraway' was built under R version 4.2.3
##
## Attaching package: 'faraway'
## The following object is masked from 'package:GGally':
##
## happy
## The following object is masked from 'package:psych':
##
## logit
halfnorm(lm.influence(fit)$hat, nlab = 2, ylab="Leverages")
ql[c(255,783),]
## LANGUAGE AGE RACE_ETHNICITY SEX QOL_Q_01 QOL_Q_02 QOL_Q_03 QOL_Q_04
## 296 ENGLISH 40 NHPIS MALE 2 2 2 1
## 923 ENGLISH 58 NHPIS MALE 3 4 3 3
## QOL_Q_05 QOL_Q_06 QOL_Q_07 QOL_Q_08 QOL_Q_09 QOL_Q_10 MSA_Q_01 MSA_Q_02
## 296 2 1 1 1 2 2 2 2
## 923 3 2 1 1 3 1 1 2
## MSA_Q_03 MSA_Q_04 MSA_Q_05 MSA_Q_06 MSA_Q_07 MSA_Q_08 MSA_Q_09 MSA_Q_10
## 296 1 2 1 1 1 6 1 2
## 923 2 1 1 1 1 1 1 1
## MSA_Q_11 MSA_Q_12 MSA_Q_13 MSA_Q_14 MSA_Q_15 MSA_Q_16 MSA_Q_17 PH2_Q_01
## 296 1 3 2 1 1 1 1 5
## 923 1 1 1 1 6 6 1 1
## PH2_Q_02 TOS_Q_01 TOS_Q_02 TOS_Q_03 TOS_Q_04 CHARLSONSCORE
## 296 1 1 4 4 5 0
## 923 1 1 4 4 5 1
## CHRONICDISEASESCORE
## 296 0
## 923 2Predict outcomes for new data
# Categorize by age 30 above and below.
ql$AGE<-ifelse(ql$AGE>=30, 1, 0)
# Include only the significant and demographic variables of previous data.
fit2<-lm(CHRONICDISEASESCORE ~ ., data=ql[,c(1:4,6,10:11,20,38:39)])
summary(fit2)
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ ., data = ql[, c(1:4, 6, 10:11,
## 20, 38:39)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0525 -0.6765 -0.0041 0.3924 2.9009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.082160 0.211861 -0.388 0.698205
## LANGUAGESPANISH -0.109363 0.067368 -1.623 0.104671
## AGE 0.484972 0.136504 3.553 0.000390 ***
## RACE_ETHNICITYASIAN -0.079565 0.167137 -0.476 0.634093
## RACE_ETHNICITYBLACK -0.130269 0.153212 -0.850 0.395288
## RACE_ETHNICITYHISPANIC 0.126918 0.151539 0.838 0.402396
## RACE_ETHNICITYNHPIS -0.054760 0.355878 -0.154 0.877727
## RACE_ETHNICITYUNKNOWN 0.085692 0.168143 0.510 0.610364
## RACE_ETHNICITYWHITE -0.072733 0.161392 -0.451 0.652284
## SEXMALE -0.044426 0.036931 -1.203 0.229150
## QOL_Q_02 0.066253 0.017335 3.822 0.000136 ***
## QOL_Q_06 0.034672 0.015321 2.263 0.023743 *
## QOL_Q_07 0.029484 0.006553 4.499 7.22e-06 ***
## MSA_Q_06 0.037547 0.013629 2.755 0.005922 **
## CHARLSONSCORE 0.116103 0.014109 8.229 3.38e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7895 on 1973 degrees of freedom
## Multiple R-squared: 0.09968, Adjusted R-squared: 0.09329
## F-statistic: 15.6 on 14 and 1973 DF, p-value: < 2.2e-16
# Significant Variables are similar to previous data.
# Coefficients of AGE have increased drastically compare with previous data.Try to improve the model performance using step
function based on AIC and BIC.
step(fit2,direction = "backward")
## Start: AIC=-925.04
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + SEX +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - SEX 1 0.902 1230.5 -925.58
## <none> 1229.7 -925.04
## - LANGUAGE 1 1.642 1231.3 -924.38
## - QOL_Q_06 1 3.192 1232.8 -921.88
## - MSA_Q_06 1 4.731 1234.4 -919.40
## - RACE_ETHNICITY 6 11.217 1240.9 -918.98
## - AGE 1 7.867 1237.5 -914.36
## - QOL_Q_02 1 9.104 1238.8 -912.37
## - QOL_Q_07 1 12.616 1242.3 -906.74
## - CHARLSONSCORE 1 42.206 1271.9 -859.95
##
## Step: AIC=-925.58
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + QOL_Q_02 +
## QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## <none> 1230.5 -925.58
## - LANGUAGE 1 1.468 1232.0 -925.21
## - QOL_Q_06 1 3.181 1233.7 -922.45
## - RACE_ETHNICITY 6 10.758 1241.3 -920.27
## - MSA_Q_06 1 4.713 1235.3 -919.98
## - AGE 1 8.530 1239.1 -913.85
## - QOL_Q_02 1 8.930 1239.5 -913.21
## - QOL_Q_07 1 13.691 1244.2 -905.58
## - CHARLSONSCORE 1 41.465 1272.0 -861.70
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE,
## data = ql[, c(1:4, 6, 10:11, 20, 38:39)])
##
## Coefficients:
## (Intercept) LANGUAGESPANISH AGE
## -0.11835 -0.10306 0.50221
## RACE_ETHNICITYASIAN RACE_ETHNICITYBLACK RACE_ETHNICITYHISPANIC
## -0.07789 -0.12875 0.12157
## RACE_ETHNICITYNHPIS RACE_ETHNICITYUNKNOWN RACE_ETHNICITYWHITE
## -0.05902 0.08367 -0.07806
## QOL_Q_02 QOL_Q_06 QOL_Q_07
## 0.06558 0.03461 0.03047
## MSA_Q_06 CHARLSONSCORE
## 0.03748 0.11466
step(fit2,direction = "forward") ## retain all variables
## Start: AIC=-925.04
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + SEX +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY +
## SEX + QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE,
## data = ql[, c(1:4, 6, 10:11, 20, 38:39)])
##
## Coefficients:
## (Intercept) LANGUAGESPANISH AGE
## -0.08216 -0.10936 0.48497
## RACE_ETHNICITYASIAN RACE_ETHNICITYBLACK RACE_ETHNICITYHISPANIC
## -0.07956 -0.13027 0.12692
## RACE_ETHNICITYNHPIS RACE_ETHNICITYUNKNOWN RACE_ETHNICITYWHITE
## -0.05476 0.08569 -0.07273
## SEXMALE QOL_Q_02 QOL_Q_06
## -0.04443 0.06625 0.03467
## QOL_Q_07 MSA_Q_06 CHARLSONSCORE
## 0.02948 0.03755 0.11610
step(fit2,direction = "both") ## Identical to backward
## Start: AIC=-925.04
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + SEX +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - SEX 1 0.902 1230.5 -925.58
## <none> 1229.7 -925.04
## - LANGUAGE 1 1.642 1231.3 -924.38
## - QOL_Q_06 1 3.192 1232.8 -921.88
## - MSA_Q_06 1 4.731 1234.4 -919.40
## - RACE_ETHNICITY 6 11.217 1240.9 -918.98
## - AGE 1 7.867 1237.5 -914.36
## - QOL_Q_02 1 9.104 1238.8 -912.37
## - QOL_Q_07 1 12.616 1242.3 -906.74
## - CHARLSONSCORE 1 42.206 1271.9 -859.95
##
## Step: AIC=-925.58
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + QOL_Q_02 +
## QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## <none> 1230.5 -925.58
## - LANGUAGE 1 1.468 1232.0 -925.21
## + SEX 1 0.902 1229.7 -925.04
## - QOL_Q_06 1 3.181 1233.7 -922.45
## - RACE_ETHNICITY 6 10.758 1241.3 -920.27
## - MSA_Q_06 1 4.713 1235.3 -919.98
## - AGE 1 8.530 1239.1 -913.85
## - QOL_Q_02 1 8.930 1239.5 -913.21
## - QOL_Q_07 1 13.691 1244.2 -905.58
## - CHARLSONSCORE 1 41.465 1272.0 -861.70
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE,
## data = ql[, c(1:4, 6, 10:11, 20, 38:39)])
##
## Coefficients:
## (Intercept) LANGUAGESPANISH AGE
## -0.11835 -0.10306 0.50221
## RACE_ETHNICITYASIAN RACE_ETHNICITYBLACK RACE_ETHNICITYHISPANIC
## -0.07789 -0.12875 0.12157
## RACE_ETHNICITYNHPIS RACE_ETHNICITYUNKNOWN RACE_ETHNICITYWHITE
## -0.05902 0.08367 -0.07806
## QOL_Q_02 QOL_Q_06 QOL_Q_07
## 0.06558 0.03461 0.03047
## MSA_Q_06 CHARLSONSCORE
## 0.03748 0.11466
fit3<-step(fit2,k=2,direction = "backward")
## Start: AIC=-925.04
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + SEX +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - SEX 1 0.902 1230.5 -925.58
## <none> 1229.7 -925.04
## - LANGUAGE 1 1.642 1231.3 -924.38
## - QOL_Q_06 1 3.192 1232.8 -921.88
## - MSA_Q_06 1 4.731 1234.4 -919.40
## - RACE_ETHNICITY 6 11.217 1240.9 -918.98
## - AGE 1 7.867 1237.5 -914.36
## - QOL_Q_02 1 9.104 1238.8 -912.37
## - QOL_Q_07 1 12.616 1242.3 -906.74
## - CHARLSONSCORE 1 42.206 1271.9 -859.95
##
## Step: AIC=-925.58
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + QOL_Q_02 +
## QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## <none> 1230.5 -925.58
## - LANGUAGE 1 1.468 1232.0 -925.21
## - QOL_Q_06 1 3.181 1233.7 -922.45
## - RACE_ETHNICITY 6 10.758 1241.3 -920.27
## - MSA_Q_06 1 4.713 1235.3 -919.98
## - AGE 1 8.530 1239.1 -913.85
## - QOL_Q_02 1 8.930 1239.5 -913.21
## - QOL_Q_07 1 13.691 1244.2 -905.58
## - CHARLSONSCORE 1 41.465 1272.0 -861.70
step(fit2,k=log(nrow(ql)),direction = "backward")
## Start: AIC=-841.11
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY + SEX +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - RACE_ETHNICITY 6 11.217 1240.9 -868.63
## - SEX 1 0.902 1230.5 -847.25
## - LANGUAGE 1 1.642 1231.3 -846.05
## - QOL_Q_06 1 3.192 1232.8 -843.55
## <none> 1229.7 -841.11
## - MSA_Q_06 1 4.731 1234.4 -841.07
## - AGE 1 7.867 1237.5 -836.03
## - QOL_Q_02 1 9.104 1238.8 -834.04
## - QOL_Q_07 1 12.616 1242.3 -828.42
## - CHARLSONSCORE 1 42.206 1271.9 -781.62
##
## Step: AIC=-868.63
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + SEX + QOL_Q_02 + QOL_Q_06 +
## QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - SEX 1 0.443 1241.3 -875.52
## - LANGUAGE 1 2.531 1243.4 -872.18
## - QOL_Q_06 1 2.744 1243.6 -871.83
## - MSA_Q_06 1 4.419 1245.3 -869.16
## <none> 1240.9 -868.63
## - AGE 1 7.183 1248.0 -864.75
## - QOL_Q_02 1 9.834 1250.7 -860.53
## - QOL_Q_07 1 12.201 1253.1 -856.77
## - CHARLSONSCORE 1 42.535 1283.4 -809.22
##
## Step: AIC=-875.52
## CHRONICDISEASESCORE ~ LANGUAGE + AGE + QOL_Q_02 + QOL_Q_06 +
## QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - LANGUAGE 1 2.590 1243.9 -878.97
## - QOL_Q_06 1 2.751 1244.1 -878.71
## - MSA_Q_06 1 4.399 1245.7 -876.08
## <none> 1241.3 -875.52
## - AGE 1 7.685 1249.0 -870.84
## - QOL_Q_02 1 9.698 1251.0 -867.64
## - QOL_Q_07 1 13.009 1254.3 -862.38
## - CHARLSONSCORE 1 42.102 1283.4 -816.80
##
## Step: AIC=-878.97
## CHRONICDISEASESCORE ~ AGE + QOL_Q_02 + QOL_Q_06 + QOL_Q_07 +
## MSA_Q_06 + CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## - QOL_Q_06 1 3.128 1247.0 -881.57
## - MSA_Q_06 1 4.516 1248.4 -879.36
## <none> 1243.9 -878.97
## - AGE 1 7.594 1251.5 -874.46
## - QOL_Q_02 1 10.116 1254.0 -870.46
## - QOL_Q_07 1 12.111 1256.0 -867.30
## - CHARLSONSCORE 1 42.776 1286.7 -819.35
##
## Step: AIC=-881.57
## CHRONICDISEASESCORE ~ AGE + QOL_Q_02 + QOL_Q_07 + MSA_Q_06 +
## CHARLSONSCORE
##
## Df Sum of Sq RSS AIC
## <none> 1247.0 -881.57
## - MSA_Q_06 1 5.283 1252.3 -880.76
## - AGE 1 8.191 1255.2 -876.15
## - QOL_Q_02 1 14.145 1261.2 -866.74
## - QOL_Q_07 1 17.065 1264.1 -862.14
## - CHARLSONSCORE 1 46.047 1293.1 -817.08
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ AGE + QOL_Q_02 + QOL_Q_07 +
## MSA_Q_06 + CHARLSONSCORE, data = ql[, c(1:4, 6, 10:11, 20,
## 38:39)])
##
## Coefficients:
## (Intercept) AGE QOL_Q_02 QOL_Q_07 MSA_Q_06
## -0.13947 0.48938 0.07956 0.03243 0.03934
## CHARLSONSCORE
## 0.11983
# fit3: AIC has lower value than BIC, thus better fit model.
summary(fit3)
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ LANGUAGE + AGE + RACE_ETHNICITY +
## QOL_Q_02 + QOL_Q_06 + QOL_Q_07 + MSA_Q_06 + CHARLSONSCORE,
## data = ql[, c(1:4, 6, 10:11, 20, 38:39)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01931 -0.67205 -0.00315 0.38513 2.91531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.118348 0.209738 -0.564 0.572637
## LANGUAGESPANISH -0.103063 0.067172 -1.534 0.125112
## AGE 0.502214 0.135765 3.699 0.000222 ***
## RACE_ETHNICITYASIAN -0.077889 0.167150 -0.466 0.641278
## RACE_ETHNICITYBLACK -0.128755 0.153224 -0.840 0.400839
## RACE_ETHNICITYHISPANIC 0.121570 0.151491 0.802 0.422364
## RACE_ETHNICITYNHPIS -0.059023 0.355901 -0.166 0.868299
## RACE_ETHNICITYUNKNOWN 0.083667 0.168154 0.498 0.618848
## RACE_ETHNICITYWHITE -0.078055 0.161349 -0.484 0.628606
## QOL_Q_02 0.065581 0.017328 3.785 0.000158 ***
## QOL_Q_06 0.034615 0.015323 2.259 0.023992 *
## QOL_Q_07 0.030472 0.006502 4.686 2.97e-06 ***
## MSA_Q_06 0.037479 0.013630 2.750 0.006019 **
## CHARLSONSCORE 0.114663 0.014059 8.156 6.10e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7895 on 1974 degrees of freedom
## Multiple R-squared: 0.09902, Adjusted R-squared: 0.09309
## F-statistic: 16.69 on 13 and 1974 DF, p-value: < 2.2e-16
#F Statistics is significant, at least one predictor is related to response variable.
# Identical significant variables as previous new data.
# Every per unit increase by AGE resulted in a 0.502 increase in CHRONICDISEASESCORE.
# Every per unit increase by QOL_Q_02, QOL_Q_06, QOL_Q_07 and MSA_Q_06 resulted in 0.066,0.035, 0.030 and 0.037 increase in CHRONICDISEASESCORE respectively.
# Every per unit increase by CHARLSONSCORE resulted in 0.114 increase in CHRONICDISEASESCORE.
# The range of actual and predicted value have a median of -0.003 & range of 5 (-2.02 to 2.92)
# Rsq of 0.09 indicates a weak relationship between the independent and dependent variables.Fit a regression tree model and compare with OLS model.
set.seed(1234)
train_index <- sample(seq_len(nrow(ql)), size = 0.75*nrow(ql))
ql_train<-ql[train_index, ]
ql_test<-ql[-train_index, ]
summary(lm(CHRONICDISEASESCORE~., data=ql_train))
##
## Call:
## lm(formula = CHRONICDISEASESCORE ~ ., data = ql_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.14883 -0.66731 -0.00825 0.38780 2.90044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0257307 0.2799454 -0.092 0.92678
## LANGUAGESPANISH -0.0770432 0.0790493 -0.975 0.32991
## AGE 0.4571456 0.1598073 2.861 0.00429 **
## RACE_ETHNICITYASIAN -0.0597598 0.1931044 -0.309 0.75701
## RACE_ETHNICITYBLACK -0.0907984 0.1774578 -0.512 0.60897
## RACE_ETHNICITYHISPANIC 0.1616797 0.1738576 0.930 0.35255
## RACE_ETHNICITYNHPIS -0.0920479 0.4360994 -0.211 0.83286
## RACE_ETHNICITYUNKNOWN 0.1110545 0.1964899 0.565 0.57203
## RACE_ETHNICITYWHITE -0.0246570 0.1864585 -0.132 0.89481
## SEXMALE -0.0598108 0.0449652 -1.330 0.18368
## QOL_Q_01 -0.0004913 0.0279118 -0.018 0.98596
## QOL_Q_02 0.0733640 0.0259839 2.823 0.00482 **
## QOL_Q_03 0.0005606 0.0294120 0.019 0.98480
## QOL_Q_04 -0.0231842 0.0225800 -1.027 0.30471
## QOL_Q_05 0.0221581 0.0226104 0.980 0.32725
## QOL_Q_06 0.0579824 0.0200179 2.897 0.00383 **
## QOL_Q_07 0.0213856 0.0091307 2.342 0.01931 *
## QOL_Q_08 -0.0131849 0.0229508 -0.574 0.56573
## QOL_Q_09 -0.0518087 0.0250643 -2.067 0.03891 *
## QOL_Q_10 -0.0022962 0.0199863 -0.115 0.90855
## MSA_Q_01 0.0415030 0.0200249 2.073 0.03839 *
## MSA_Q_02 0.0091648 0.0197516 0.464 0.64272
## MSA_Q_03 -0.0103725 0.0174358 -0.595 0.55201
## MSA_Q_04 0.0027707 0.0170532 0.162 0.87095
## MSA_Q_05 -0.0014824 0.0185146 -0.080 0.93619
## MSA_Q_06 0.0275880 0.0181914 1.517 0.12960
## MSA_Q_07 -0.0086310 0.0168410 -0.513 0.60838
## MSA_Q_08 -0.0074402 0.0223500 -0.333 0.73926
## MSA_Q_09 -0.0060522 0.0179840 -0.337 0.73652
## MSA_Q_10 -0.0031801 0.0187496 -0.170 0.86534
## MSA_Q_11 -0.0062949 0.0136786 -0.460 0.64544
## MSA_Q_12 0.0039744 0.0164366 0.242 0.80897
## MSA_Q_13 0.0159564 0.0179037 0.891 0.37295
## MSA_Q_14 -0.0017463 0.0188061 -0.093 0.92603
## MSA_Q_15 -0.0117920 0.0182987 -0.644 0.51941
## MSA_Q_16 -0.0108404 0.0172565 -0.628 0.52997
## MSA_Q_17 0.0143954 0.0166083 0.867 0.38622
## PH2_Q_01 0.0159562 0.0205568 0.776 0.43776
## PH2_Q_02 0.0140759 0.0225818 0.623 0.53317
## TOS_Q_01 0.0005752 0.0230805 0.025 0.98012
## TOS_Q_02 -0.0281588 0.0168897 -1.667 0.09569 .
## TOS_Q_03 -0.0161438 0.0704074 -0.229 0.81868
## TOS_Q_04 0.0193578 0.0554551 0.349 0.72709
## CHARLSONSCORE 0.1182002 0.0166371 7.105 1.89e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7951 on 1447 degrees of freedom
## Multiple R-squared: 0.1145, Adjusted R-squared: 0.08822
## F-statistic: 4.353 on 43 and 1447 DF, p-value: < 2.2e-16
## Train data has identical significant variable when compared with unsplit new data.
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:faraway':
##
## solder
ql.rpart<-rpart(CHRONICDISEASESCORE~., data=ql_train)
ql.rpart
## n= 1491
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1491 1033.1160 0.9644534
## 2) CHARLSONSCORE< 0.5 560 267.7839 0.6053571
## 4) MSA_Q_02< 3.5 336 127.9732 0.4910714 *
## 5) MSA_Q_02>=3.5 224 128.8393 0.7767857 *
## 3) CHARLSONSCORE>=0.5 931 649.6842 1.1804510
## 6) MSA_Q_02< 3.5 602 339.9286 1.0714290 *
## 7) MSA_Q_02>=3.5 329 289.5076 1.3799390 *
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
rpart.plot(ql.rpart, digits=4)
ql.p<-predict(ql.rpart, ql_test)
summary(ql.p)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4911 0.7768 1.0714 0.9779 1.0714 1.3799
cor(ql.p, ql_test$CHRONICDISEASESCORE)
## [1] 0.3006661
##Poorly correlated (0.30) with their true value counterparts.
MAE<-function(obs, pred){
mean(abs(obs-pred))
}
MAE(ql_test$CHRONICDISEASESCORE,ql.p)
## [1] 0.6313087
## 0.6313
MAE(ql_test$CHRONICDISEASESCORE,mean(ql_test$CHRONICDISEASESCORE))
## [1] 0.5834767
## 0.5835
##Regression tree model and OLS model serve different purposes. Regression Tree model classify predictors by conditions. For example, Charlson Score are split with options of Yes or No by MSA_Q_02, depending on score. Charlson Score of Less than 1 and MSA_Q_02 score of less han 4 have an average score of 0.4911. OLS estimates coefficient of variables which describe the relationship between the predictors and response variable.Try to use M5P to improve the model.
Sys.getenv("JAVA_HOME")
## [1] "C:/Program Files (x86)/Java/jre1.8.0_361"
Sys.setenv(JAVA_HOME="C:/Program Files (x86)/Java/jre1.8.0_361")
library(rJava)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.2.3
ql.m5 <- M5P(CHRONICDISEASESCORE~., data=ql_train)
ql.m5
## M5 pruned model tree:
## (using smoothed linear models)
##
## CHARLSONSCORE <= 0.5 :
## | MSA_Q_01 <= 3.5 : LM1 (418/71.737%)
## | MSA_Q_01 > 3.5 : LM2 (142/90.248%)
## CHARLSONSCORE > 0.5 :
## | MSA_Q_02 <= 3.5 : LM3 (602/88.29%)
## | MSA_Q_02 > 3.5 :
## | | PH2_Q_02 <= 1.5 :
## | | | MSA_Q_06 <= 2.5 : LM4 (80/80.003%)
## | | | MSA_Q_06 > 2.5 : LM5 (46/81.21%)
## | | PH2_Q_02 > 1.5 : LM6 (203/111.651%)
##
## LM num: 1
## CHRONICDISEASESCORE =
## -0.2061 * LANGUAGE=SPANISH
## + 0.1008 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.0045 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.0054 * SEX=FEMALE
## + 0.091 * QOL_Q_02
## + 0.0033 * QOL_Q_06
## + 0.023 * QOL_Q_07
## - 0.0014 * QOL_Q_08
## - 0.0013 * QOL_Q_09
## + 0.0023 * MSA_Q_01
## + 0.0011 * MSA_Q_02
## - 0.0013 * MSA_Q_05
## + 0.0018 * MSA_Q_06
## - 0.0505 * MSA_Q_07
## - 0.0356 * MSA_Q_09
## - 0.0016 * MSA_Q_10
## - 0.032 * MSA_Q_14
## + 0.0633 * MSA_Q_17
## + 0.0007 * PH2_Q_01
## - 0.0018 * TOS_Q_02
## + 0.0031 * CHARLSONSCORE
## + 0.2525
##
## LM num: 2
## CHRONICDISEASESCORE =
## -0.0137 * LANGUAGE=SPANISH
## + 0.0083 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.0045 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.2377 * SEX=FEMALE
## + 0.0106 * QOL_Q_02
## + 0.1524 * QOL_Q_06
## + 0.0027 * QOL_Q_07
## - 0.0037 * QOL_Q_08
## - 0.0013 * QOL_Q_09
## + 0.0043 * MSA_Q_01
## + 0.0032 * MSA_Q_02
## - 0.0035 * MSA_Q_05
## + 0.005 * MSA_Q_06
## - 0.0029 * MSA_Q_07
## - 0.0036 * MSA_Q_09
## - 0.0841 * MSA_Q_10
## + 0.1118 * PH2_Q_01
## - 0.1096 * TOS_Q_01
## - 0.1042 * TOS_Q_02
## + 0.0031 * CHARLSONSCORE
## + 0.4688
##
## LM num: 3
## CHRONICDISEASESCORE =
## 0.0023 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.2113 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.0033 * SEX=FEMALE
## + 0.0938 * QOL_Q_02
## - 0.0425 * QOL_Q_04
## + 0.0426 * QOL_Q_06
## + 0.0009 * QOL_Q_07
## - 0.0647 * QOL_Q_09
## + 0.0412 * MSA_Q_01
## + 0.0008 * MSA_Q_02
## + 0.0004 * PH2_Q_01
## + 0.0008 * PH2_Q_02
## - 0.0013 * TOS_Q_02
## + 0.0019 * CHARLSONSCORE
## + 0.7686
##
## LM num: 4
## CHRONICDISEASESCORE =
## 0.0041 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.0518 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.028 * SEX=FEMALE
## + 0.0035 * QOL_Q_02
## - 0.0841 * QOL_Q_04
## + 0.0153 * QOL_Q_06
## + 0.0073 * QOL_Q_07
## + 0.1033 * QOL_Q_08
## - 0.0042 * QOL_Q_09
## + 0.0027 * MSA_Q_01
## + 0.0014 * MSA_Q_02
## + 0.0215 * MSA_Q_05
## + 0.0147 * MSA_Q_06
## - 0.0182 * MSA_Q_11
## + 0.008 * MSA_Q_13
## + 0.0084 * MSA_Q_14
## - 0.0058 * MSA_Q_15
## + 0.0004 * PH2_Q_01
## + 0.0015 * PH2_Q_02
## + 0.0098 * TOS_Q_01
## - 0.0108 * TOS_Q_02
## + 0.0019 * CHARLSONSCORE
## + 0.8506
##
## LM num: 5
## CHRONICDISEASESCORE =
## 0.0041 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.0518 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.028 * SEX=FEMALE
## + 0.0035 * QOL_Q_02
## + 0.0153 * QOL_Q_06
## + 0.0073 * QOL_Q_07
## - 0.0042 * QOL_Q_09
## + 0.0027 * MSA_Q_01
## + 0.0014 * MSA_Q_02
## - 0.0895 * MSA_Q_03
## + 0.3383 * MSA_Q_05
## + 0.0229 * MSA_Q_06
## - 0.0241 * MSA_Q_11
## + 0.008 * MSA_Q_13
## + 0.0084 * MSA_Q_14
## - 0.0058 * MSA_Q_15
## + 0.0004 * PH2_Q_01
## + 0.0015 * PH2_Q_02
## + 0.0098 * TOS_Q_01
## - 0.0108 * TOS_Q_02
## + 0.0019 * CHARLSONSCORE
## + 0.772
##
## LM num: 6
## CHRONICDISEASESCORE =
## 0.0041 * RACE_ETHNICITY=WHITE,AI-ALN,UNKNOWN,HISPANIC
## + 0.4092 * RACE_ETHNICITY=UNKNOWN,HISPANIC
## + 0.0199 * SEX=FEMALE
## + 0.0035 * QOL_Q_02
## + 0.2416 * QOL_Q_06
## + 0.0052 * QOL_Q_07
## - 0.0042 * QOL_Q_09
## + 0.086 * MSA_Q_01
## + 0.0014 * MSA_Q_02
## - 0.0805 * MSA_Q_04
## - 0.0822 * MSA_Q_11
## + 0.0051 * MSA_Q_13
## + 0.0055 * MSA_Q_14
## - 0.0037 * MSA_Q_15
## + 0.0004 * PH2_Q_01
## + 0.0015 * PH2_Q_02
## + 0.0063 * TOS_Q_01
## - 0.0076 * TOS_Q_02
## + 0.0019 * CHARLSONSCORE
## + 0.5968
##
## Number of Rules : 6
summary(ql.m5)
##
## === Summary ===
##
## Correlation coefficient 0.4914
## Mean absolute error 0.566
## Root mean squared error 0.7252
## Relative absolute error 94.0868 %
## Root relative squared error 87.1244 %
## Total Number of Instances 1491
#Correlation coefficient 0.4914
#Mean absolute error 0.566
### M5 model has higher Correlation and lower MAE of than prediction tree. Thus the model is optimized.
ql.p.m5<-predict(ql.m5, ql_test)
summary(ql.p.m5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07583 0.69591 1.01151 1.01569 1.23885 2.56245
cor(ql.p.m5, ql_test$CHRONICDISEASESCORE)
## [1] 0.2998408
#0.2998
MAE(ql_test$CHRONICDISEASESCORE, ql.p.m5)
## [1] 0.6413623
#0.641